We generate new random data based on the value frequencies of the original features. This way we get a new random dataset with the same features. The marginal distribution of each feature will be (approximately) the same as in the original data, but the features in the simulated data will be independent of each other.
import pandas as pd
import numpy as np
import pandas_profiling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Read the cleaned dataset from the Excel workbook.
data = pd.read_excel("./data/clean/data_all_clean.xlsx")
data.columns  # bare expression; the result is not stored

# Remove three columns by name before profiling
# (presumably a saved row index and identifier fields -- confirm).
data = data.drop(columns=['Unnamed: 0', 'ID', 'Hallgató_ID'])

# Build the profiling report of the original features and save it as HTML.
profile = data.profile_report(
    title='Report on the features of the original data'
)
profile.to_file(output_file="report_on_features_orig.html")
# Number of synthetic rows drawn for every feature.
n_samples = 10000

# Draw each column on its own, so the simulated features are sampled
# independently of one another.
simulated_columns = {}
for column in data.columns:
    # Relative frequency of every observed value; value_counts() skips
    # missing values, so dividing by the non-missing count sums to 1.
    freq = data[column].value_counts() / data[column].notna().sum()
    simulated_columns[column] = np.random.choice(
        freq.index.tolist(), n_samples, p=freq
    )

# Assemble the simulated frame with the same column order as the original.
sim = pd.DataFrame(simulated_columns, columns=data.columns)

# Full-width report; the returned object is not stored.
sim.profile_report(style={"full_width":True})

# Build the profiling report of the simulated features and save it as HTML.
profile_sim = sim.profile_report(
    title='Report on the features of the simulated data'
)
profile_sim.to_file(output_file="report_on_features_simul.html")
Encode the simulated data with integer labels. Later, some variables will be encoded with one-hot encoding.
# Label-encode every column of the simulated data independently.
# NOTE(review): sim_enc is not used below -- the split is performed on the
# un-encoded `sim`. Confirm whether the split should use `sim_enc` instead.
sim_enc = sim.apply(LabelEncoder().fit_transform)

# Every column except the target is a feature.
features = set(sim.columns)
features.remove('Statusz_vegzett')

# Index with an ordered list rather than the set itself: pandas >= 2.0
# rejects a set as an indexer, and set iteration order is not stable between
# runs, which would make the column order of the exported files vary.
feature_columns = [col for col in sim.columns if col in features]

# Hold out 20% as the test set, then 20% of the remainder as validation.
X_train, X_test, y_train, y_test = train_test_split(
    sim[feature_columns], sim['Statusz_vegzett'], test_size=0.2, random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1
)

# Report the shape and type of each split and write it to "<name>.csv".
l = [X_train, X_val, X_test, y_train, y_val, y_test]
split_names = ["X_train", "X_val", "X_test", "y_train", "y_val", "y_test"]
for df, name in zip(l, split_names):
    print(df.shape, type(df))
    df.to_csv(name + ".csv", header=True, sep=";")